This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.
Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Ctrl+Alt+I.
When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Ctrl+Shift+K to preview the HTML file).
The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.
library(tidyverse)
# install for visualizations
library(ggplot2)
# install to combine date and time
library(lubridate)
# for melting a df
library(reshape)
wego <- read_csv("../data/Route 50 Timepoint and Headway Data, 1-1-2023 through 5-12-2025.csv")
wego
# Create new date time column
wego$DATE_TIME <- ymd(wego$DATE) + hms(wego$SCHEDULED_TIME)
# Examine Data
wego
# Filter February TSP values
feb3_10_tsp <- wego |>
filter(between(DATE_TIME,
as.Date("2025-02-03 12:00:00"),
as.Date("2025-02-10 12:00:00")))
# Filter Feb-Apr TSP with buses only 2 minutes late or more
feb10_apr28_tsp <- wego |>
filter(between(DATE_TIME,
as.Date("2025-02-10 12:00:00"),
as.Date("2025-04-28 12:00:00")))
# Filter May TSP values
may5_12_tsp <- wego |>
filter(between(DATE_TIME,
as.Date("2025-05-05 12:00:00"),
as.Date("2025-05-12 12:00:00")))
may12_tsp <- wego |>
filter(DATE_TIME > as.Date("2025-05-12 12:00:00"))
# Add day of week column
wego <- wego |>
mutate(
DATE_TIME = as.POSIXct(DATE_TIME),
DAY_OF_WEEK = wday(DATE_TIME,
label = TRUE,
abbr = FALSE))
wego
NA
# Combine tsp variables into one
tsp_rows <- bind_rows(
feb3_10_tsp,
feb10_apr28_tsp,
may5_12_tsp,
may12_tsp
) |>
select('ADHERENCE_ID', 'DATE_TIME') |>
distinct() |>
mutate(tsp = 1) # Add tsp indicator column for each distinct adherence id
wego <- wego |>
left_join(
tsp_rows,
by = c('ADHERENCE_ID', 'DATE_TIME')
) |>
mutate(tsp = coalesce(tsp, 0))
wego #|> view()
NA
NA
# wego <- wego |> mutate(
# tsp_indicator = if_else(
# between(DATE_TIME,
# as.Date("2025-02-03 12:00:00"),
# as.Date("2025-02-10 12:00:00")) |
# (between(DATE_TIME,
# as.Date("2025-02-10 12:00:00"),
# as.Date("2025-04-28 12:00:00")) &
# ADHERENCE <= -2) |
# between(DATE_TIME,
# as.Date("2025-05-05 12:00:00"),
# as.Date("2025-05-12 12:00:00")), 1, 0)
#
# )
#
# wego
wego <- wego |> mutate(
HOUR = hms(SCHEDULED_TIME) |>
hour()
)
wego <- wego |>
mutate(
time_of_day = case_when(
between(HOUR, 4, 5) ~ "early_morning",
between(HOUR, 6, 8) ~ "morning_peak",
between(HOUR, 9, 14) ~ "midday",
between(HOUR, 15, 17) ~ "pm_peak",
between(HOUR, 18, 20) ~ "evening",
between(HOUR, 21, 23) ~ "late_night",
between(HOUR, 0, 3) ~ "late_night",
.default = "other"
)
)
wego
NA
tod_table = table(wego$time_of_day)
pt_tod_table <- prop.table(tod_table)
pt_tod_table
early_morning evening late_night midday morning_peak other pm_peak
0.03272224 0.12055936 0.09740419 0.36547452 0.16413139 0.03097748 0.18873082
tod_table
early_morning evening late_night midday morning_peak other pm_peak
20255 74626 60293 226228 101597 19175 116824
barplot(table(wego$time_of_day), main = "Time of day distribution")
table_tod <- pt_tod_table
# Create a color vector
color <- rainbow(nrow(table_tod))
# Set the rotation for x-axis labels to 45 degrees
par(las=2)
# Create the vertically stacked bar plot
bp <- barplot(table_tod, main = "Time of day distribution", col = color)
# Add the legend
legend("topright", legend = rownames(table_tod),cex = 0.75, fill = color)
# Add x-axis labels with a 45 degree angle
# axis(1, at=bp, labels=colnames(table_tod), las=2, cex.axis=2)
late_tod <- table(wego$time_of_day, wego$ADJUSTED_LATE_COUNT)
# Create a color vector
color <- rainbow(nrow(late_tod))
# Set the rotation for x-axis labels to 45 degrees
par(las=2)
# Create the vertically stacked bar plot
bp <- barplot(late_tod, main = "Late bus dist", col = color)
# Add the legend
legend("topright", legend = rownames(late_tod),cex = 0.9, fill = color)
# Add x-axis labels with a 45 degree angle
axis(1, at=bp, labels=colnames(late_tod), las=2, cex.axis=1)
count_tod <- wego |>
count(time_of_day)
count_tod
unique(wego$ADJUSTED_LATE_COUNT)
[1] 0 1
wego
# count_tod <- wego |>
# count(time_of_day)
# # value <- count_tod$n
# value = count_tod
# # table(wego$time_of_day)
# condition <- tod <- c("early_morning", "morning_peak", "midday", "pm_peak", "evening", "late_night", "late_night", "other") #wego$time_of_day
# specie <- wego$ADJUSTED_LATE_COUNT
#
# ggplot(wego, aes(fill=condition, y=value, x=specie)) +
# geom_bar(position="fill", stat="identity")
wego_tod_count_late <- wego |>
group_by(time_of_day, ADJUSTED_LATE_COUNT) |>
summarize(n = n())
wego_tod_count_late
wego$time_of_day <- factor(wego$time_of_day, levels = c("early_morning", "morning_peak", "midday", "pm_peak", "evening", "late_night", "other"))
ggplot(wego_tod_count_late, aes(fill=time_of_day, y=n, x=factor(ADJUSTED_LATE_COUNT))) +
geom_bar(position="fill", stat="identity")+
xlab("Ontime (0) and Late (1) Buses") +
ylab("Proportion of Buses") +
ggtitle("Ontime and Late Buses Based on Time of Day")
time_day_log <- glm(ADJUSTED_LATE_COUNT ~ tsp * time_of_day,
data = wego,
family = "binomial")
summary(time_day_log)
Call:
glm(formula = ADJUSTED_LATE_COUNT ~ tsp * time_of_day, family = "binomial",
data = wego)
Coefficients: (1 not defined because of singularities)
Estimate Std. Error z value Pr(>|z|)
(Intercept) -3.55137 0.04518 -78.609 <2e-16 ***
tsp -0.06663 0.14162 -0.471 0.638
time_of_daymorning_peak 1.07271 0.04686 22.894 <2e-16 ***
time_of_daymidday 1.55349 0.04570 33.996 <2e-16 ***
time_of_daypm_peak 2.24291 0.04581 48.964 <2e-16 ***
time_of_dayevening 1.48443 0.04681 31.709 <2e-16 ***
time_of_daylate_night 0.79994 0.04869 16.430 <2e-16 ***
time_of_dayother 0.95078 0.05340 17.804 <2e-16 ***
tsp:time_of_daymorning_peak 0.13380 0.14638 0.914 0.361
tsp:time_of_daymidday -0.03235 0.14327 -0.226 0.821
tsp:time_of_daypm_peak -0.07528 0.14364 -0.524 0.600
tsp:time_of_dayevening 0.14890 0.14623 1.018 0.309
tsp:time_of_daylate_night 0.05855 0.15214 0.385 0.700
tsp:time_of_dayother NA NA NA NA
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 449971 on 618997 degrees of freedom
Residual deviance: 435083 on 618985 degrees of freedom
AIC: 435109
Number of Fisher Scoring iterations: 6
# wego$time_of_day <- factor(wego$time_of_day, levels = c("early_morning", "morning_peak", "midday", "pm_peak", "evening", "late_night", "other"))
# wego$time_of_day <- relevel(wego$time_of_day, "early_morning", "morning_peak", "midday", "pm_peak", "evening", "late_night", "other")
tod_plot <- ggplot(wego_tod_count_late,
aes(fill = factor(time_of_day, levels = c("early_morning", "morning_peak", "midday", "pm_peak", "evening", "late_night", "other")),
y=n,
x=factor(ADJUSTED_LATE_COUNT))) +
geom_bar(position="fill", stat="identity", color="black") +
labs(title = "Buses Often Run Late During the PM Peak", x = "", y = "Proportion of Buses", fill = "Time of Day") +
scale_x_discrete(labels=c("ontime", "late")) +
scale_fill_manual(labels = c("early_morning", "morning_peak", "midday", "pm_peak", "evening", "late_night", "other"), values = c("#191919", "#f3e4ee", "khaki", "#b4eef0", "#e3fafb", "#b5ffd0", "grey")) +
theme(plot.title = element_text(hjust = 0.5))
tod_plot
# dev.copy(tod_plot, 'time_of_day_orig.pdf')
# # dev.off()
ggsave("time_of_day_orig.png", plot = tod_plot, width=8, height=5, dpi=300)
time_day_log <- glm(ADJUSTED_LATE_COUNT ~ tsp * time_of_day,
data = wego,
family = "binomial")
summary(time_day_log)
Call:
glm(formula = ADJUSTED_LATE_COUNT ~ tsp * time_of_day, family = "binomial",
data = wego)
Coefficients: (1 not defined because of singularities)
Estimate Std. Error z value Pr(>|z|)
(Intercept) -3.55137 0.04518 -78.609 <2e-16 ***
tsp -0.06663 0.14162 -0.471 0.638
time_of_daymorning_peak 1.07271 0.04686 22.894 <2e-16 ***
time_of_daymidday 1.55349 0.04570 33.996 <2e-16 ***
time_of_daypm_peak 2.24291 0.04581 48.964 <2e-16 ***
time_of_dayevening 1.48443 0.04681 31.709 <2e-16 ***
time_of_daylate_night 0.79994 0.04869 16.430 <2e-16 ***
time_of_dayother 0.95078 0.05340 17.804 <2e-16 ***
tsp:time_of_daymorning_peak 0.13380 0.14638 0.914 0.361
tsp:time_of_daymidday -0.03235 0.14327 -0.226 0.821
tsp:time_of_daypm_peak -0.07528 0.14364 -0.524 0.600
tsp:time_of_dayevening 0.14890 0.14623 1.018 0.309
tsp:time_of_daylate_night 0.05855 0.15214 0.385 0.700
tsp:time_of_dayother NA NA NA NA
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 449971 on 618997 degrees of freedom
Residual deviance: 435083 on 618985 degrees of freedom
AIC: 435109
Number of Fisher Scoring iterations: 6
lgrgmdl_tod <- coef(summary(time_day_log))
lgrgmdl_tod
Estimate Std. Error z value Pr(>|z|)
(Intercept) -3.55137191 0.04517772 -78.6089264 0.000000e+00
tsp -0.06663320 0.14161723 -0.4705162 6.379863e-01
time_of_daymorning_peak 1.07270680 0.04685579 22.8937914 5.357362e-116
time_of_daymidday 1.55348816 0.04569561 33.9964414 2.514579e-253
time_of_daypm_peak 2.24291242 0.04580693 48.9644762 0.000000e+00
time_of_dayevening 1.48442969 0.04681342 31.7094878 1.149713e-220
time_of_daylate_night 0.79994200 0.04868662 16.4304266 1.158412e-60
time_of_dayother 0.95078086 0.05340193 17.8042433 6.551173e-71
tsp:time_of_daymorning_peak 0.13379876 0.14637636 0.9140736 3.606782e-01
tsp:time_of_daymidday -0.03235218 0.14327252 -0.2258087 8.213502e-01
tsp:time_of_daypm_peak -0.07528052 0.14363627 -0.5241052 6.002053e-01
tsp:time_of_dayevening 0.14890016 0.14623470 1.0182273 3.085700e-01
tsp:time_of_daylate_night 0.05854647 0.15214032 0.3848189 7.003716e-01
summary(time_day_log)$coefficients[3, 1]
[1] 1.072707
lg_tod_table <- as_tibble(rownames_to_column(data.frame(lgrgmdl_tod)))
lg_tod_table["tsp_indicator"] = c(0,1,0,0,0,0,0,1,1,1,1,1)
Error in `[<-`:
! Assigned data `c(0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1)` must be compatible with existing data.
✖ Existing data has 13 rows.
✖ Assigned data has 12 rows.
ℹ Only vectors of size 1 are recycled.
Caused by error in `vectbl_recycle_rhs_rows()`:
! Can't recycle input of size 12 to size 13.
Run `]8;;x-r-run:rlang::last_trace()rlang::last_trace()]8;;` to see where the error occurred.
lg_tod_table
plot <- ggplot(lg_tod_table, aes(factor(rowname), Estimate, fill = factor(tsp_indicator))) +
geom_bar(stat="identity", position = "dodge") +
scale_fill_brewer(palette = "Set1")
plot
ggsave("attempt_plot.png", plot = plot)
lg_tod_table |>
group_by(tsp_indicator, Estimate) |>
ggplot(ggplot2::aes(rowname, Estimate)) +
geom_bar(ggplot2::aes(fill = tsp_indicator), position = "dodge", stat="identity")
lg_tod_limted <- lg_tod_table |> slice(c(-1,-2))
lg_tod_limted
lg_tod_table |>
gather(tsp_indicator, Estimate, -rowname) |>
ggplot(aes(x=rowname, y=Estimate, fill=tsp_indicator)) +
geom_col(position = "dodge")
prob_late_tod <- wego |>
group_by(time_of_day, tsp) |>
summarize(mean_adj_late_count = mean(ADJUSTED_LATE_COUNT))
prob_late_tod
NA
wego |>
group_by(time_of_day, tsp) |>
summarise(mean(ADJUSTED_LATE_COUNT))
NA
NA
time_day_log <- glm(ADJUSTED_LATE_COUNT ~ tsp * time_of_day,
data = wego,
family = "binomial")
summary(time_day_log)
Call:
glm(formula = ADJUSTED_LATE_COUNT ~ tsp * time_of_day, family = "binomial",
data = wego)
Coefficients: (1 not defined because of singularities)
Estimate Std. Error z value Pr(>|z|)
(Intercept) -3.55137 0.04518 -78.609 <2e-16 ***
tsp -0.06663 0.14162 -0.471 0.638
time_of_daymorning_peak 1.07271 0.04686 22.894 <2e-16 ***
time_of_daymidday 1.55349 0.04570 33.996 <2e-16 ***
time_of_daypm_peak 2.24291 0.04581 48.964 <2e-16 ***
time_of_dayevening 1.48443 0.04681 31.709 <2e-16 ***
time_of_daylate_night 0.79994 0.04869 16.430 <2e-16 ***
time_of_dayother 0.95078 0.05340 17.804 <2e-16 ***
tsp:time_of_daymorning_peak 0.13380 0.14638 0.914 0.361
tsp:time_of_daymidday -0.03235 0.14327 -0.226 0.821
tsp:time_of_daypm_peak -0.07528 0.14364 -0.524 0.600
tsp:time_of_dayevening 0.14890 0.14623 1.018 0.309
tsp:time_of_daylate_night 0.05855 0.15214 0.385 0.700
tsp:time_of_dayother NA NA NA NA
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 449971 on 618997 degrees of freedom
Residual deviance: 435083 on 618985 degrees of freedom
AIC: 435109
Number of Fisher Scoring iterations: 6
tsp_tod <- with(wego, data.frame(time_of_day = c("early_morning", "morning_peak", "midday", "pm_peak", "evening", "late_night"), tsp=1))
tsp_on_tod <- predict(time_day_log, tsp_tod, type="response")
prcnt_tsp_on_tod <- tsp_on_tod*100
prcnt_tsp_on_tod
1 2 3 4 5 6
2.613480 8.229999 10.940150 18.994413 12.082133 5.955143
tod_affect <- with(wego, data.frame(time_of_day = c("early_morning", "morning_peak", "midday", "pm_peak", "evening", "late_night"), tsp=0))
tod_alone <- predict(time_day_log, tod_affect, type="response")
prcnt_tsp_alone <- tod_alone*100
prcnt_tsp_alone
1 2 3 4 5 6
2.788536 7.736743 11.942529 21.274474 11.235163 6.000594
# pred <- tod_alone$fit
plot(tod_alone, type="l", ylab="Predicted Probability to Vote", xlab="Age", bty="n")
tod_alone
1 2 3 4 5 6
0.02611264 0.07374139 0.11401405 0.20471454 0.10731519 0.05668719
df <- cbind.data.frame("time_of_day" = c("early_morning", "morning_peak", "midday", "pm_peak", "evening", "late_night"), "tsp_on" = tod_alone, "tod_w_tsp" = tsp_tod)
barplot(height = c(df$tsp_on, df$tod_w_tsp),
names.arg = df$time_of_day,
main = "With and Without TSP, TIME OF DAY",
xlab = "Time of Day",
ylab = "Probability of Late Buses",
col = c("#191919", "#f3e4ee", "khaki", "#b4eef0", "#e3fafb", "#b5ffd0", "grey"),
las = 2)
sbs_tod_tsp <- rbind(prcnt_tsp_alone,prcnt_tsp_on_tod)
sbs_tod_tsp
1 2 3 4 5 6
prcnt_tsp_alone 2.788536 7.736743 11.94253 21.27447 11.23516 6.000594
prcnt_tsp_on_tod 2.613480 8.229999 10.94015 18.99441 12.08213 5.955143
barplot(sbs_tod_tsp,beside=T)
sbs_tod_tsp_rn <- cbind(rownames_list = rownames(sbs_tod_tsp), sbs_tod_tsp)
sbs_tod_tsp_rn
rownames_list 1 2 3 4 5 6
prcnt_tsp_alone "prcnt_tsp_alone" "2.78853601867628" "7.73674346829416" "11.9425293614457" "21.2744740830849" "11.235162592402" "6.00059448614359"
prcnt_tsp_on_tod "prcnt_tsp_on_tod" "2.61348005529302" "8.22999908157393" "10.9401497291216" "18.9944134080031" "12.0821334668027" "5.95514307811133"
prcnt_tsp_alone
1 2 3 4 5 6
2.788536 7.736743 11.942529 21.274474 11.235163 6.000594
tod_affect
mid_tod_tsp <- list(
c( time_of_day = "early_morning", tsp = 0),
c( time_of_day = "early_morning", tsp = 1),
c( time_of_day = "morning_peak", tsp = 0),
c( time_of_day = "morning_peak", tsp = 1),
c( time_of_day = "midday", tsp = 0),
c( time_of_day = "midday", tsp = 1),
c( time_of_day = "pm_peak", tsp = 0),
c( time_of_day = "pm_peak", tsp = 1),
c( time_of_day = "evening", tsp = 0),
c( time_of_day = "evening", tsp = 1),
c( time_of_day = "late_night", tsp = 0),
c( time_of_day = "late_night", tsp = 1)
)
mid_tod_tsp
[[1]]
time_of_day tsp
"early_morning" "0"
[[2]]
time_of_day tsp
"early_morning" "1"
[[3]]
time_of_day tsp
"morning_peak" "0"
[[4]]
time_of_day tsp
"morning_peak" "1"
[[5]]
time_of_day tsp
"midday" "0"
[[6]]
time_of_day tsp
"midday" "1"
[[7]]
time_of_day tsp
"pm_peak" "0"
[[8]]
time_of_day tsp
"pm_peak" "1"
[[9]]
time_of_day tsp
"evening" "0"
[[10]]
time_of_day tsp
"evening" "1"
[[11]]
time_of_day tsp
"late_night" "0"
[[12]]
time_of_day tsp
"late_night" "1"
middle_tod_tsp <- bind_rows(mid_tod_tsp)
middle_tod_tsp
middle_tod_tsp <- middle_tod_tsp |>
mutate(tsp = as.numeric(tsp))
wego
# mid_tod_tsp
tod_stand_alone <- predict(time_day_log, middle_tod_tsp, type="response")
tod_stand_alone
1 2 3 4 5 6 7 8 9 10 11 12
0.02788536 0.02613480 0.07736743 0.08229999 0.11942529 0.10940150 0.21274474 0.18994413 0.11235163 0.12082133 0.06000594 0.05955143
middle_tod_tsp <- middle_tod_tsp |>
mutate(Probs = tod_stand_alone)
probs_tod_tsp <- ggplot(middle_tod_tsp,
aes(x=fct_relevel(time_of_day, c("early_morning", "morning_peak", "midday", "pm_peak", "evening", "late_night")), y=Probs,
fill=factor(tsp)
)
) +
geom_bar(position="dodge", stat="identity", color="black") +
labs(title = "The Effect of TSP on the Probability of Buses Being Late", x = "time_of_day", y = "Probability of Buses Being Late", fill = "tsp") +
scale_fill_manual(labels = c("tsp_off", "tsp_on"), values = c("#b4eef0", "khaki")) +
theme(plot.title = element_text(hjust = 0.5))
probs_tod_tsp
NA
NA
Probs_tod_tsp_notitle <- ggplot(middle_tod_tsp,
aes(x=fct_relevel(time_of_day, c("early_morning", "morning_peak", "midday", "pm_peak", "evening", "late_night")), y=Probs,
fill=factor(tsp)
)
) +
geom_bar(position="dodge", stat="identity", color="black") +
labs(title = "", x = "time_of_day", y = "Probability of Buses Being Late", fill = "tsp") +
scale_fill_manual(labels = c("tsp_off", "tsp_on"), values = c("#b4eef0", "khaki")) +
theme(plot.title = element_text(hjust = 0.5))
Probs_tod_tsp_notitle
NA
NA
ggsave("probs_lt_tsp_tod_nttl.png", plot = Probs_tod_tsp_notitle, width=8, height=5, dpi=300)
ggsave("probs_lt_tsp_tod.png", plot = probs_tod_tsp, width=8, height=5, dpi=300)
time_day_log <- glm(ADJUSTED_LATE_COUNT ~ tsp * time_of_day * day_of_week,
data = wego,
family = "binomial")
Error in eval(predvars, data, env) : object 'day_of_week' not found